library(dplyr)
library(tidyverse)
library(mlr)
library(gridExtra)
library(ggfortify)
library(edgeR)
library(org.Hs.eg.db)
library(RColorBrewer)
library(gplots)
library(tidyr)
library(pheatmap)
library(reshape2)
library(survminer)
library(rms)
library(pheatmap)
library(ggplot2)
my_theme <- function(){
theme(
axis.text = element_text(size=8),
axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
axis.title = element_text(size =8),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#ffffff"),
strip.background = element_rect(fill = "black", color = "black", size =0.5),
strip.text = element_text(face = "bold", size = 8, color = "white"),
legend.position = "bottom",
legend.justification = "center",
legend.background = element_blank(),
panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
)
}
theme_set(my_theme())
mycolors=c("darkred","black","#004431","#000c44","#2d1600","purple","#202302")
myfillcolors=c("#d10c0c","grey10","#1e6651","#213989","#5b431a","purple","#393f03")
clin_data <- read.csv("sc3_Training_ClinAnnotations.csv")
## group age into bins for visualization purposes
labs <- c(paste(seq(20, 80, by=15), seq(20+15-1, 95-1, by=15), sep="-"))
clin_data$agegrp <- cut(clin_data$D_Age, breaks=c(seq(20,80, by=15), Inf), labels=labs, right=F)
## assumption: events where HR_FLAG it is not TRUE (event did not occur) are assumed to be censored
## 131 out of 583 events occurred and rest are censored
clin_data$HR_FLAG <- clin_data$HR_FLAG==TRUE
## check % of NAs present in each column of the dataset
nacount <- apply(clin_data, 2, function(x){
sum(is.na(x))
})
nacount <- as.data.frame(nacount, row.names=names(clin_data))
nacount <- add_rownames(nacount) %>% dplyr::mutate(percentNA=round(nacount/583*100,2))%>%
dplyr::filter(percentNA>=0.00)
## remove columns where NAs >50%
cols_to_remove <- nacount[nacount$percentNA>50,]$rowname
clin_data <- clin_data[, !(names(clin_data) %in% cols_to_remove)]
## remove unnecessary columns -> keep only D_PFS as time column, remove file info columns,
## study and patient status columns also removed (all same)
clin_data <- clin_data[, -c(1, 5:6, 8, 10:11,13:17)]
## remove NAs (only 20 in D_ISS column)
clin_data <- clin_data[complete.cases(clin_data),]
##plot % NAs
p <-ggplot(nacount, aes(x=rowname, y=percentNA))+geom_col(fill="blue")+coord_flip()+theme_minimal()+
ggtitle("Percentage of NAs in each column")
## age by gender distribution plot
q <- clin_data %>% ggplot(aes(x=agegrp, color=D_Gender, fill=D_Gender))+geom_bar()+
ggtitle("Age distribution by gender")+xlab("Age group [years]")+ylab("Number of subjects")+theme_minimal()
grid.arrange(p,q, ncol=2)
Data is checked for any missing values. Columns with more than 50% NAs are excluded from the analysis. For columns with less than 50% NAs (D_ISS), the corresponding observations with NAs are excluded. In the age by sex distribution plot it is observed that most subjects belong to middle to old age groups (50-80 years old) and there are more males than females in the dataset.
par(mfrow=c(1,2))
## Kaplan-Meier curve for events in patients stratified by age and ISS
surv_plots <- list()
fit_iss=survival::survfit(survival::Surv(D_PFS,HR_FLAG)~D_ISS,data=clin_data)
ggsurvplot(fit_iss, pval = TRUE, conf.int = TRUE)
#autoplot(fit_iss)+scale_color_manual(values=mycolors)+scale_fill_manual(values=myfillcolors)
fit_age = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~agegrp,data=clin_data)
ggsurvplot(fit_age, pval = TRUE, conf.int = TRUE)
#autoplot(fit_age)+scale_color_manual(values=mycolors)+scale_fill_manual(values=myfillcolors)
These figures indicate that the patients who were in their third stage and belonged to an older age group have higher PFS risk. Now, that it’s established that both age and ISS can be important predictors of MM death or relapse, a small 2-predictor model is built to apply a Cox regression model and predict the PFS time using the main effects of the two predictors - age and ISS.
## simple 2-predictor model with coxph using only age and ISS
age_iss_model <- survival::coxph(survival::Surv(D_PFS,HR_FLAG) ~
D_Age + D_ISS , data=clin_data)
summary(age_iss_model)
## Call:
## survival::coxph(formula = survival::Surv(D_PFS, HR_FLAG) ~ D_Age +
## D_ISS, data = clin_data)
##
## n= 563, number of events= 122
##
## coef exp(coef) se(coef) z Pr(>|z|)
## D_Age 0.026554 1.026909 0.008763 3.030 0.00244 **
## D_ISS 0.581850 1.789346 0.122814 4.738 2.16e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## D_Age 1.027 0.9738 1.009 1.045
## D_ISS 1.789 0.5589 1.407 2.276
##
## Concordance= 0.662 (se = 0.025 )
## Likelihood ratio test= 39.91 on 2 df, p=2e-09
## Wald test = 38.42 on 2 df, p=5e-09
## Score (logrank) test = 40.15 on 2 df, p=2e-09
Again, it appears that both age and ISS have a statistically significant effect, as predictors in the model.
## simple 13-predictor model using only cyto features with coxph - to detect which cyto features could be relevant as significant predictors
cyto_feat_model <- survival::coxph(survival::Surv(D_PFS,HR_FLAG) ~
CYTO_predicted_feature_01+CYTO_predicted_feature_02+CYTO_predicted_feature_03+
CYTO_predicted_feature_05+CYTO_predicted_feature_06+CYTO_predicted_feature_08+
CYTO_predicted_feature_12+CYTO_predicted_feature_13+CYTO_predicted_feature_14+
CYTO_predicted_feature_15+CYTO_predicted_feature_16+CYTO_predicted_feature_17+
CYTO_predicted_feature_18, data=clin_data)
summary(cyto_feat_model)
## Call:
## survival::coxph(formula = survival::Surv(D_PFS, HR_FLAG) ~ CYTO_predicted_feature_01 +
## CYTO_predicted_feature_02 + CYTO_predicted_feature_03 + CYTO_predicted_feature_05 +
## CYTO_predicted_feature_06 + CYTO_predicted_feature_08 + CYTO_predicted_feature_12 +
## CYTO_predicted_feature_13 + CYTO_predicted_feature_14 + CYTO_predicted_feature_15 +
## CYTO_predicted_feature_16 + CYTO_predicted_feature_17 + CYTO_predicted_feature_18,
## data = clin_data)
##
## n= 563, number of events= 122
##
## coef exp(coef) se(coef) z Pr(>|z|)
## CYTO_predicted_feature_01 0.41477 1.51402 0.20968 1.978 0.04791 *
## CYTO_predicted_feature_02 0.74766 2.11204 0.73900 1.012 0.31168
## CYTO_predicted_feature_03 -1.01689 0.36172 0.33074 -3.075 0.00211 **
## CYTO_predicted_feature_05 0.18337 1.20126 0.21800 0.841 0.40024
## CYTO_predicted_feature_06 0.24968 1.28362 0.26331 0.948 0.34301
## CYTO_predicted_feature_08 0.27432 1.31563 0.29038 0.945 0.34481
## CYTO_predicted_feature_12 0.58770 1.79985 1.64586 0.357 0.72103
## CYTO_predicted_feature_13 -0.14107 0.86843 0.38509 -0.366 0.71413
## CYTO_predicted_feature_14 1.04206 2.83506 0.48209 2.162 0.03065 *
## CYTO_predicted_feature_15 0.02776 1.02815 0.27154 0.102 0.91856
## CYTO_predicted_feature_16 -0.51062 0.60012 1.01129 -0.505 0.61361
## CYTO_predicted_feature_17 NA NA 0.00000 NA NA
## CYTO_predicted_feature_18 -0.04945 0.95175 0.34209 -0.145 0.88506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## CYTO_predicted_feature_01 1.5140 0.6605 1.00382 2.2835
## CYTO_predicted_feature_02 2.1120 0.4735 0.49621 8.9896
## CYTO_predicted_feature_03 0.3617 2.7646 0.18917 0.6917
## CYTO_predicted_feature_05 1.2013 0.8325 0.78357 1.8416
## CYTO_predicted_feature_06 1.2836 0.7790 0.76613 2.1506
## CYTO_predicted_feature_08 1.3156 0.7601 0.74467 2.3244
## CYTO_predicted_feature_12 1.7998 0.5556 0.07150 45.3101
## CYTO_predicted_feature_13 0.8684 1.1515 0.40827 1.8473
## CYTO_predicted_feature_14 2.8351 0.3527 1.10205 7.2933
## CYTO_predicted_feature_15 1.0282 0.9726 0.60384 1.7506
## CYTO_predicted_feature_16 0.6001 1.6663 0.08269 4.3556
## CYTO_predicted_feature_17 NA NA NA NA
## CYTO_predicted_feature_18 0.9518 1.0507 0.48678 1.8608
##
## Concordance= 0.61 (se = 0.026 )
## Likelihood ratio test= 20.78 on 12 df, p=0.05
## Wald test = 20.07 on 12 df, p=0.07
## Score (logrank) test = 21.3 on 12 df, p=0.05
Cox regression model built using only the 13 cytogenetic features suggests that only 3 out of 13 features have statistically significant effect on PFS. Therefore, these 3 features will be selected for further analysis. Next, Kaplan-meier curves for these 3 features will be plotted.
par(mfrow=c(1,3))
fit_cyto1 = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~CYTO_predicted_feature_01,data=clin_data)
fit_cyto3 = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~CYTO_predicted_feature_03,data=clin_data)
fit_cyto14 = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~CYTO_predicted_feature_14,data=clin_data)
ggsurvplot(fit_cyto1,
pval = TRUE, conf.int = TRUE)
ggsurvplot(fit_cyto3,
pval = TRUE, conf.int = TRUE)
ggsurvplot(fit_cyto14,
pval = TRUE, conf.int = TRUE)
Again, these plots suggest that the 3 features can be important predictors in our model, therefore they will be for the further downstream analysis.
## include only these 3 cyto features as they seem to have significant effect on survival
clin_data <- clin_data %>% dplyr::select(-c("CYTO_predicted_feature_02",
"CYTO_predicted_feature_05", "CYTO_predicted_feature_06", "CYTO_predicted_feature_08",
"CYTO_predicted_feature_12", "CYTO_predicted_feature_13",
"CYTO_predicted_feature_15", "CYTO_predicted_feature_16", "CYTO_predicted_feature_17",
"CYTO_predicted_feature_18"))
## Descriptive statistics
psych::describeBy(clin_data$D_PFS,group=clin_data$CYTO_predicted_feature_01)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 425 547.42 320.8 521 527.44 326.17 1 1572 1571 0.56 0.01 15.56
## ----------------------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 138 518.5 332.02 480 492.79 335.07 1 1497 1496 0.65 -0.06 28.26
psych::describeBy(clin_data$D_PFS,group=clin_data$CYTO_predicted_feature_03)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 464 528.08 315.45 490 507.6 329.14 1 1556 1555 0.57 -0.12 14.64
## ----------------------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 99 597.74 355.08 623 575.86 281.69 1 1572 1571 0.51 0.08 35.69
psych::describeBy(clin_data$D_PFS,group=clin_data$CYTO_predicted_feature_14)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 551 541.49 325.73 517 520.02 327.65 1 1572 1571 0.57 -0.04 13.88
## ----------------------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 12 487 198.91 512.5 485.6 294.3 237 751 514 -0.11 -1.73 57.42
psych::describeBy(clin_data$D_PFS,group=clin_data$agegrp)
##
## Descriptive statistics by group
## group: 20-34
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 937.5 130.81 937.5 937.5 137.14 845 1030 185 0 -2.75 92.5
## ----------------------------------------------------------------------------
## group: 35-49
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 49 649.43 345.94 710 641.73 330.62 16 1373 1357 0.09 -0.79 49.42
## ----------------------------------------------------------------------------
## group: 50-64
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 233 552.45 304.7 541 539.07 299.49 1 1497 1496 0.38 -0.2 19.96
## ----------------------------------------------------------------------------
## group: 65-79
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 234 521.2 324.03 456.5 492.15 280.21 1 1572 1571 0.85 0.62 21.18
## ----------------------------------------------------------------------------
## group: 80-94
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 45 440.64 356.54 331 398.95 370.65 36 1433 1397 0.9 -0.1 53.15
psych::describeBy(clin_data$D_PFS,group=clin_data$D_ISS)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 189 593.67 323.77 595 574.72 309.86 1 1572 1571 0.55 0.09 23.55
## ----------------------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 209 534.31 311.37 510 518.53 318.76 1 1436 1435 0.48 -0.14 21.54
## ----------------------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 165 486.87 330.68 427 456.62 363.24 1 1556 1555 0.76 0.08 25.74
The descriptive analysis of the relevant factors from the clinical data show that median PFS time is different across the different groups of features. It will be interesting to see how these features combined with expression data perform in the predictive analysis and contribute to the prediction of death or relapse risk in newly diagnosed MM patients.
Next, expression dataset will be explored and later integrated with the clinical data to train a predictive model. The expression dataset is filtered for only those MM relevant genes based on literature research. Furthermore, only those samples for which clinical data is also available will be used for the further analysis. This way it becomes easier to integrate the two datasets to build a predictive model.
exp_data <- read.csv("MMRF_CoMMpass_IA9_E74GTF_Salmon_entrezID_TPM_hg19.csv")
## list the risk genes - based on literature research
riskgenes <- c("KRAS", "NRAS", "DIS3", "TENT5C",
"BRAF","HUWE1", "TP53", "TRAF3",
"EGR1", "ATM", "H1-4", "FGFR3", "UBR5", "PRKD2", "CYLD", "ACTG1",
"IRF4", "MAX", "KMT2C", "CREBBP", "CCND1", "ARID1A", "EPAS1",
"ERC2", "PRC1", "CSGALNACT1", "PHF19", "NSD2")
# Remove first column from expdata - countdata, contains only the counts for all samples.
countdata_initial <- exp_data[,-1]
# Store EntrezGeneID as rownames
rownames(countdata_initial) <- exp_data[,1]
# make dgelist object
dge_initial <- DGEList(countdata_initial)
## get annotations for entrez ids
ann <- select(org.Hs.eg.db,keys=rownames(dge_initial$counts),columns=c("ENTREZID","SYMBOL","GENENAME"))
##double check that the ENTREZID column matches exactly to our rownames in our count data.
print(paste0("Entrez IDs in expression data = Entrez IDs in count data: True for all ", table(ann$ENTREZID==rownames(dge_initial$counts)), " Entrez IDs"))
## [1] "Entrez IDs in expression data = Entrez IDs in count data: True for all 24128 Entrez IDs"
## add this information to expression dataset
exp_data$genes <- ann$SYMBOL
## filter expression dataset for risk genes
mm_risk <- exp_data %>% dplyr::filter(genes %in% riskgenes)
## filter for samples where clinical info also available
mm_risk_expdata <- mm_risk[, colnames(mm_risk) %in% clin_data$RNASeq_transLevelExpFileSamplId]
## bring columns in expression data in the correct order as in the clinical dataset
mm_risk_expdata <- mm_risk_expdata[clin_data$RNASeq_transLevelExpFileSamplId]
##rename entrez id and gene columns
mm_risk_expdata$Entrez_Id <- mm_risk$X
mm_risk_expdata$Gene <- mm_risk$genes
## make new countdata using the filtered dataset
countdata <- mm_risk_expdata[,-(564:565)]
# Store EntrezGeneID as rownames
rownames(countdata) <- mm_risk_expdata[,564]
##Make sure that the column names are now the same as sample name in the clinical data file. This is good because it means our sample information in clinical data is in the same order as the columns in countdata
print(paste0("Column names = Sample names: True for all ", table(colnames(countdata)==clin_data$RNASeq_transLevelExpFileSamplId), " samples"))
## [1] "Column names = Sample names: True for all 563 samples"
## make a dgelist object for visualizations, qc checks
dge <- DGEList(countdata)
Here, as I have subsetted the expression dataset for MM relevant genes, the filtering of lowly expressed genes (which is an important step in a standard RNASeq analysis pipeline), will be skipped. So, as next step, quality control on all the samples in the dataset will be performed to check that the data is of good quality, and that the samples are as we would expect.
## add clinical covariates (age+sex) to DGEList object
group_age <- factor(clin_data$agegrp)
group_sex <- factor(clin_data$D_Gender)
# Add the group information into the DGEList
dge$samples$age <- group_age
dge$samples$gender <- group_sex
## qc checks
## library size and distribution
barplot(dge$samples$lib.size/1e06, names=colnames(dge), las=2, ann=FALSE, cex.names=0.2)
mtext(side = 1, text = "Samples", line = 4)
mtext(side = 2, text = "Library size (millions)", line = 3)
title("Barplot of library sizes")
# examine the distributions of the raw counts using log of the counts.
# Get log2 counts per million
logcounts <- cpm(dge,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2, cex.axis=0.2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")
From the boxplots it is oberved that the overall distributions of raw log-intensities are not identical but still not very different. If a sample is really far above or below the blue horizontal there my be a need to investigate that sample further. Therefore, for now all the samples will be kept. Next, multi-dimensional scaling (MDS) plots will be used as they are an incredibly useful tool for quality control and checking for outliers.
## multi-dimensional scaling plots
# set up colour schemes for age and gender
par(mfrow=c(1,2))
col.sex <- c("purple","orange")[as.factor(clin_data$D_Gender)]
col.age <- c("blue","red","black","green", "purple", "orange")[clin_data$agegrp]
#MDS with disease stage colouring
plotMDS(dge,col=col.sex)
# Let's add a legend to the plot so we know which colours correspond to which stage
legend("topleft",fill=c("purple","orange"),legend=levels(as.factor(clin_data$D_Gender)))
# Add a title
title("Sex")
# Similarly for age
plotMDS(dge,col=col.age)
legend("topleft",fill=c("blue","red","black","green", "purple", "orange"),legend=levels(clin_data$agegrp),cex=0.5)
title("Age")
No abnormalities are observed across the samples, as there are no clear outliers in the MDS plots. However, one would expect to observe a clear clustering of same age or sex groups and this is something that is not observed in these plots. Still, all the samples in the dataset will be kept for further downstream analysis, as none of them stand out in any way.
Next, the two datasets will be merged.
## reshape the expression dataset so that genes become column names
final_exp_data <- mm_risk_expdata[,-564] %>%
gather(key = key, value = value, 1:563) %>%
spread(key = names(mm_risk_expdata)[565], value = "value")
## rename the sample id column to merge with clinical data in the next step
final_exp_data <- final_exp_data %>% dplyr::rename(RNASeq_transLevelExpFileSamplId=key)
## merge clinical and expression data
clin_exp_merge <- merge(x=clin_data, y=final_exp_data, by="RNASeq_transLevelExpFileSamplId", all.x=T)
## change column names not suitable for R
clin_exp_merge <- clin_exp_merge %>% dplyr::rename(H1_4 = "H1-4")
## correlation of genes with event time
corres_genes <- cor(clin_exp_merge[,c(5, 12:39)], method="pearson", use="pairwise.complete.obs")
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(corres_genes)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# plot Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
geom_text(aes(label = round(value, 2)), size=1.5)+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
The correlation of MM relevant gene expression levels with PFS are represented in the heatmap. Positive correlations are red and negative correlations are blue. The plot indicates that some genes, namely CCND1, H1_4, HUWE1, KRAS, MAX, NRAS, PHF19, PRC1, TENT5C, TP53 and UBR5 show negative correlation with PFS. It will be interesting to see if these features will appear to be important predictors and how they impact the survival risk.
## remove irrelevant columns from the dataset
train_data <- clin_exp_merge %>% dplyr::select(-c("RNASeq_transLevelExpFileSamplId", "Patient", "agegrp", "D_Gender"))
## generate random forest SRC importance plots
rfi=generateFilterValuesData(surv.task,method=c("randomForestSRC_importance"))%>%.$data%>%ggplot(aes(x=reorder(name,value),y=value,fill=reorder(name,value)))+geom_bar(alpha=0.8,stat="identity",color="black",show.legend=F)+scale_x_discrete("Features")+coord_flip()
## generate Cox-PH model univariate scores
uv=generateFilterValuesData(surv.task,imp.learner=makeLearner("surv.coxph"),method="univariate.model.score")%>%.$data%>%ggplot(aes(x=reorder(name,value),y=value,fill=reorder(name,value)))+geom_bar(alpha=0.8,stat="identity",color="black",show.legend=F)+scale_x_discrete("Features")+coord_flip()
## plot
grid.arrange(rfi,uv, ncol=2)
Results show some features are not important predictors, however, they still might contribute to the PFS risk (based on the univariate model). Thus, all features from the dataset will be included in our model.
Based on 5-fold cross-validation for comparing the performance of the 2 survival analysis algorithms
## make survival task
surv.task <- makeSurvTask(data=train_data, target = c("D_PFS", "HR_FLAG"))
## z-score normalize all numeric features
surv.task <- normalizeFeatures(surv.task, method="standardize", cols=c("D_Age", "ACTG1", "ARID1A", "ATM",
"BRAF", "CCND1", "CREBBP", "CSGALNACT1", "CYLD", "DIS3", "EGR1",
"EPAS1", "ERC2", "FGFR3", "H1_4", "HUWE1", "IRF4", "KMT2C", "KRAS",
"MAX", "NRAS", "NSD2", "PHF19", "PRC1", "PRKD2", "TENT5C", "TP53",
"TRAF3", "UBR5"))
#Benchmark study : comparing 2 different algorithms for survival task
# Creating a list of 2 learners
svlearners=list(
makeLearner("surv.coxph"),
makeLearner("surv.randomForestSRC")
)
## 5-fold cross-validation
rdesc=makeResampleDesc("CV", iters=5, predict = "both")
# Initalising the Benchmark study
set.seed(123)
svbnmrk=benchmark(svlearners,surv.task,rdesc)
bmrkdata=getBMRPerformances(svbnmrk, as.df = TRUE)%>%.[,-1]
## plot and check the performance of the two learners
## Distribution of C-index of 2 survival algorithms, based on a 5-fold cross-validation
par(mfrow=c(1,2))
plotBMRRanksAsBarChart(svbnmrk)+scale_fill_manual(values=myfillcolors)
ggplot(bmrkdata)+geom_path(aes(x=iter,y=cindex,color=learner.id),size=1,alpha=0.8)+facet_wrap(~learner.id,scales="free",ncol=2)+scale_color_manual(values=myfillcolors)
Both learners perform equally well, however, proceed with the random forest SRC model, as from my previous experience, RF provides more interpretative results and can identify non-linear effects of all features.
## make final learner (random forest SRC)
svlearner <- makeLearner("surv.randomForestSRC")
## set parameter space
params <- makeParamSet(makeDiscreteParam("mtry", c(2,4,6,8,10)), makeDiscreteParam("ntree", c(3,10,50,100,300,1000)))
## define type of search (GridsearchCV)
ctrl <- makeTuneControlGrid()
## define CV scheme (5-fold CV)
rdesc=makeResampleDesc("CV", iters=5, predict = "both")
set.seed(2356)
## tune hyperparameters
tune <- tuneParams(learner=svlearner, task=surv.task, resampling=rdesc, par.set=params, control = ctrl,
show.info=T)
## change params (take only optimal params)
svlearner_tune <- setHyperPars(makeLearner("surv.randomForestSRC"), mtry=tune$x$mtry, ntree=tune$x$ntree)
svlearner_tune$par.vals <- list(importance=T)
## resampling - to anticipate how the model would perform on a new dataset
## the CV scheme in mlr by default makes 80/20 split of the training dataset
cv <- resample(svlearner_tune, surv.task, rdesc, measures=cindex, models=T, extract=getFeatureImportance)
#train a model
rforestsrc <- train(svlearner_tune, surv.task)
par(mfrow=c(1,2))
## performance evaluation - check c-index across 5 iterations
c_index_df <- cv$measures.test
##plot
c_index_plot <- ggplot(c_index_df)+geom_path(aes(x=iter,y=cindex),size=1,alpha=0.8)+scale_color_manual(values=myfillcolors)
## get feature importance using the cross validation set (average of all models)
imp_df_cv <- rbind(cv$extract[[1]]$res,
cv$extract[[2]]$res,
cv$extract[[3]]$res,
cv$extract[[4]]$res,
cv$extract[[5]]$res)%>% dplyr::group_by(variable) %>% dplyr::summarise(mean=mean(importance), sd=sd(importance)) %>% arrange(desc(mean))
imp_plot <- ggplot(imp_df_cv, aes(x=reorder(variable, mean), y=mean, fill=mean))+
geom_bar(stat="identity", position="dodge")+coord_flip()+
ylab("Variable importance")+xlab("")+ggtitle("Information Value Summary")+
guides(fill=F)+scale_fill_gradient(low="red", high="blue")+theme_minimal()
grid.arrange(c_index_plot, imp_plot, ncol=2)
par(mfrow=c(2,3))
## first:divide patients into two groups using gene expression median as a cutoff
clin_exp_merge = clin_exp_merge %>%
mutate(PHF19_exp = case_when(
PHF19 > quantile(PHF19, 0.5) ~ 'PHF19_High',
PHF19 < quantile(PHF19, 0.5) ~ 'PHF19_Low',
TRUE ~ NA_character_
))
clin_exp_merge = clin_exp_merge %>%
mutate(PRC1_exp = case_when(
PRC1 > quantile(PRC1, 0.5) ~ 'PRC1_High',
PRC1< quantile(PRC1, 0.5) ~ 'PRC1_Low',
TRUE ~ NA_character_
))
clin_exp_merge = clin_exp_merge %>%
mutate(MAX_exp = case_when(
MAX > quantile(MAX, 0.5) ~ 'MAX_High',
MAX< quantile(MAX, 0.5) ~ 'MAX_Low',
TRUE ~ NA_character_
))
clin_exp_merge = clin_exp_merge %>%
mutate(CCND1_exp = case_when(
CCND1 > quantile(CCND1, 0.5) ~ 'CCND1_High',
CCND1< quantile(CCND1, 0.5) ~ 'CCND1_Low',
TRUE ~ NA_character_
))
clin_exp_merge = clin_exp_merge %>%
mutate(CSGALNACT1_exp = case_when(
CSGALNACT1> quantile(CSGALNACT1, 0.5) ~ 'CSGALNACT1_High',
CSGALNACT1< quantile(CSGALNACT1, 0.5) ~ 'CSGALNACT1_Low',
TRUE ~ NA_character_
))
fit_phf19 = survival::survfit(Surv(D_PFS, HR_FLAG) ~ PHF19_exp, data = clin_exp_merge)
fit_prc1 = survival::survfit(Surv(D_PFS, HR_FLAG) ~ PRC1_exp, data = clin_exp_merge)
fit_max = survival::survfit(Surv(D_PFS, HR_FLAG) ~ MAX_exp, data = clin_exp_merge)
fit_ccnd1 = survival::survfit(Surv(D_PFS, HR_FLAG) ~ CCND1_exp, data = clin_exp_merge)
fit_csg = survival::survfit(Surv(D_PFS, HR_FLAG) ~ CSGALNACT1_exp, data = clin_exp_merge)
## plot
ggsurvplot(fit_phf19, pval = T, conf.int = T)
ggsurvplot(fit_prc1, pval = T, conf.int = T)
ggsurvplot(fit_max, pval = T, conf.int = T)
ggsurvplot(fit_ccnd1, pval = T, conf.int = T)
ggsurvplot(fit_csg, pval = T, conf.int = T)